Purpose:

Prior to running this code:

# Set Working Directory before starting

# Immcantation
suppressPackageStartupMessages(library(alakazam))
suppressPackageStartupMessages(library(shazam))
suppressPackageStartupMessages(library(tigger))
suppressPackageStartupMessages(library(dowser))
suppressPackageStartupMessages(library(airr))

suppressPackageStartupMessages(library(glmGamPoi))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(dittoSeq))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(impactSingleCellToolkit))

# Immcantation Results
bcr_heavy_fh <- "data/results_immcantation/BCR_CSFPB_heavy_germ-pass.tsv"
bcr_light_fh <- "data/results_immcantation/BCR_CSFPB_light_germ-pass.tsv"
tcr_beta_fh  <- "data/results_immcantation/TCR_CSFPB_heavy_germ-pass.tsv"
tcr_alpha_fh <- "data/results_immcantation/TCR_CSFPB_light_germ-pass.tsv"

# seed number
seed.number <- 12345

# Workshop specific functions
source("resources/functions_workshop.R")

1. Load RNA-Seq object

rnaseq_obj <- readRDS("objects/seurat.obj.processed.rds")

2. Load and format TCR/BCR repertoire results

a. Load immcantations results

# TCR
db.tra   <- read_rearrangement(tcr_alpha_fh) %>% as.data.frame()
db.trb   <- read_rearrangement(tcr_beta_fh) %>% as.data.frame()
# BCR
db.heavy <- read_rearrangement(bcr_heavy_fh) %>% as.data.frame()
db.light <- read_rearrangement(bcr_light_fh) %>% as.data.frame()

b. filter contigs and merge all chains

  • All contigs are in frame
bcr_df <- mergeVDJresults(db.heavy,db.light,umi.thresh = 3,assay = "bcr")
tcr_df <- mergeVDJresults(db.trb,db.tra,umi.thresh = 2, assay = "tcr")

# format VDJ unique cell IDs to match scRNA-Seq (function only for this workshop)
formatUniqueIDcolum <- function(uni_id_column,sep = ":") {
  samples <- tstrsplit(uni_id_column, sep)[[1]]
  cells   <- tstrsplit(uni_id_column, sep)[[2]]
  fmt_ids <- paste(cells,samples,sep = "-")
  
  return(fmt_ids)
}

bcr_df$unique_cell_id <- formatUniqueIDcolum(bcr_df$unique_cell_id)
tcr_df$unique_cell_id <- formatUniqueIDcolum(tcr_df$unique_cell_id)

3. Only analyze cells shared across all datasets

a. get vector of overlapping cells

cells_rnaseq      <- colnames(rnaseq_obj)
cells_vdj         <- unique(c(bcr_df$unique_cell_id,
                              tcr_df$unique_cell_id))
overlapping_cells <- cells_rnaseq[cells_rnaseq %in% cells_vdj]

b. filter rna-seq and vdj data

  • Even after filtering for cells that overlap with BCR/TCR data, there are still sparse clusters of cells annotated as different cell types
# generate UMAP of cell type annotations pre-filtering
p1_pre_filter <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")

# only keep cells with vdj
rnaseq_obj     <- rnaseq_obj[,colnames(rnaseq_obj) %in% overlapping_cells]
bcr_df         <- bcr_df[bcr_df$unique_cell_id %in% overlapping_cells,]
tcr_df         <- tcr_df[tcr_df$unique_cell_id %in% overlapping_cells,]

# post filter umap
p2_post_filter <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")

plot_grid(p1_pre_filter,p2_post_filter,ncol = 2)

4. Add columns to VDJ data

# Add sample column
tcr_df$sample <- tstrsplit(tcr_df$unique_cell_id,"-")[[2]]
bcr_df$sample <- tstrsplit(bcr_df$unique_cell_id,"-")[[2]]

# Add column for CSF/PB 
tcr_df$COMPARTMENT <- tstrsplit(tcr_df$sample,"_")[[2]]
bcr_df$COMPARTMENT <- tstrsplit(bcr_df$sample,"_")[[2]]
tcr_df$COMPARTMENT[tcr_df$COMPARTMENT %in% "PBMC"] <- "PB"
bcr_df$COMPARTMENT[bcr_df$COMPARTMENT %in% "PBMC"] <- "PB"

# Add Status Disease/Healthy
tcr_df$STATUS <- tstrsplit(tcr_df$sample,"_")[[1]] %>% as.character()
tcr_df$STATUS[tcr_df$STATUS %in% "32"] <- "Healthy"
tcr_df$STATUS[tcr_df$STATUS %in% "28"] <- "Disease"

bcr_df$STATUS <- tstrsplit(bcr_df$sample,"_")[[1]] %>% as.character()
bcr_df$STATUS[bcr_df$STATUS %in% "32"] <- "Healthy"
bcr_df$STATUS[bcr_df$STATUS %in% "28"] <- "Disease"

5. Re-cluster

set.seed(seed.number)
rnaseq_obj <- scaleAndClusterSeuratObject(rnaseq_obj,dims = 1:4,npca = 5,resolution = 0.3,
                                          normalize = F,dim.reduction = T)
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5316
## Number of edges: 146863
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9040
## Number of communities: 10
## Elapsed time: 0 seconds
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
p3_reclustered <- DimPlot(object = rnaseq_obj, group.by="main_bencode", pt.size=1.0, label.size = 5,label = T,reduction = "umap")
p4_reclustered <- DimPlot(object = rnaseq_obj, group.by="fine_bencode", pt.size=1.0,label.size = 5, label = T,reduction = "umap")

plot_grid(p3_reclustered,p4_reclustered,ncol = 2)

—————

Manual cell type annotation

1. Look at the data

a. Add columns and look at UMAP plots

# Add compartment column
rnaseq_obj$compartment <- tstrsplit(rnaseq_obj$sample,"_")[[2]]
rnaseq_obj$compartment <- str_replace_all(rnaseq_obj$compartment,"PBMC","PB")

# Add patient status column
rnaseq_obj$status      <- tstrsplit(rnaseq_obj$sample,"_")[[1]]
rnaseq_obj$status[rnaseq_obj$status %in% "32"] <- "Healthy"
rnaseq_obj$status[rnaseq_obj$status %in% "28"] <- "Disease"

# loc 
loc_cols       = c("CSF" = "blue", "PB" = "red")
diagnosis_cols = c("Disease" = "purple","Healthy" = "orange")

p1 <- DimPlot(object = rnaseq_obj, group.by="seurat_clusters", pt.size=1.0, label.size = 10, alpha = 0.7,label = T,reduction = "umap")
p2 <- DimPlot(object = rnaseq_obj, group.by="compartment", pt.size=1.0, label.size = 10, alpha = 0.7,label = F,cols = loc_cols,reduction = "umap")
p3 <- DimPlot(object = rnaseq_obj, group.by="status", pt.size=1.0, label.size = 10,alpha = 0.7,label = F,cols = diagnosis_cols,reduction = "umap")

plot_grid(p1,p2,p3,ncol = 3)

b. feature/violin of canonical genes

  • For an extended list of gene markers for cell types, refer to Celltype_Markers.xlsx in the resources directory
  • PPBP = Platelets, which are noise
tcell_panel           <- c("CD3E","CD3D","CD3G","CD4","CD8A","CD8B")
bcell_panel           <- c("MS4A1","CD79A","CD79B","CD19","IGHM","IGHA1")
other_celltypes_panel <- c("CD14","LYZ","MS4A7","FCER1A","APOE","PPBP")

B cell gene panel

FeaturePlot(rnaseq_obj,features = bcell_panel,ncol = 3)

T cell gene panel

FeaturePlot(rnaseq_obj,features = tcell_panel,ncol = 3)

gene panel for other cell types

  • Should not see an defined clusters for other cells types since these data should only contain B cells and T cells
  • CD14+ Monocytes - LYZ, CD14
  • CD16+ Monocytes - MS4A7
  • Monocyte derived Dendritic Cells - FCGR3A,MS4A7
  • Macrophage - APOE
  • Platelet - PPBP
FeaturePlot(rnaseq_obj,features = other_celltypes_panel,ncol = 3)

2. Add main cell types

main_annot <- names(rnaseq_obj$orig.ident)
main_annot[main_annot %in% bcr_df$unique_cell_id] <- "B cells"
main_annot[main_annot %in% tcr_df$unique_cell_id] <- "T cells"
names(main_annot) <- names(rnaseq_obj$orig.ident)
rnaseq_obj$main.celltype.annot <- main_annot

p4 <- DimPlot(object = rnaseq_obj, group.by="seurat_clusters", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")
p5 <- DimPlot(object = rnaseq_obj, group.by="main.celltype.annot", pt.size=1.0, label.size = 5, alpha = 0.7,label = T,reduction = "umap")

plot_grid(p4,p5, ncol = 2)

3. Add in Subtype annotations

a. Violin of CD4/CD8 genes

subtype_annotations <- colnames(rnaseq_obj)

Idents(rnaseq_obj) <- rnaseq_obj$seurat_clusters
p6 <- VlnPlot(rnaseq_obj,features = c("CD4","CD8A","CD8B"),pt.size = 0,ncol = 1)

plot_grid(p6,p4,ncol = 2,rel_heights = c(8,6),rel_widths = c(6,6))

b. Define T cell subtypes based on CD8 gene expression & SingleR annotations

  • CD8 T cells are much higher in automated SingleR annotations
  • In addition, there are cell types which are mis-annotated as Macrophages, Monocytes and NK cells
  • Mis-annotated celltype cluster with B cells
  • We are going to re-annotate T cells manually based on CD8 expression
FeaturePlot(rnaseq_obj, features = c("CD8A", "CD8B"),blend = TRUE)

# SingleR Encode annotatons
singler_annot_df <- rnaseq_obj@meta.data[,c("sample","seurat_clusters","main_bencode","fine_bencode","main.celltype.annot")]

# Manual T cell annotation based on CD8 expression
CD8_pos_tcells           <- colnames(subset(rnaseq_obj, subset = CD8A > 0 | CD8B > 0))
singler_annot_df$cd8_pos <- row.names(singler_annot_df)
singler_annot_df$cd8_pos[singler_annot_df$cd8_pos %in% CD8_pos_tcells] <- "yes"
singler_annot_df$cd8_pos[! singler_annot_df$cd8_pos %in% "yes"]        <- "no"

tcell_subtype_annotation <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "T cells"])
CD8_pos_tcells <- tcell_subtype_annotation[tcell_subtype_annotation %in% CD8_pos_tcells]
CD4_tcells     <- tcell_subtype_annotation[! tcell_subtype_annotation %in% CD8_pos_tcells]

singler_annot_df$sub.celltype.annot <- singler_annot_df$main.celltype.annot
singler_annot_df[row.names(singler_annot_df) %in% CD8_pos_tcells, "sub.celltype.annot"] <- "CD8 T cells"
singler_annot_df[row.names(singler_annot_df) %in% CD4_tcells, "sub.celltype.annot"]     <- "CD4 T cells"

# Stacked bar
# - Not all SingleR predictions for CD8+ T cells express CD8
summary_main_bencode <- singler_annot_df %>% count(seurat_clusters, main_bencode, name = "count")
levels(summary_main_bencode$main_bencode) <- sort(unique(summary_main_bencode$main_bencode))
colors_main_bencode <- c("red","darkgreen","darkblue","purple","orange","grey")
p7 <- ggplot(summary_main_bencode, aes(x = seurat_clusters, y = count, fill = main_bencode)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_main_bencode) + ggtitle("SingleR Cell-type Predictions")

summary_subtype <- singler_annot_df %>% count(seurat_clusters, sub.celltype.annot, name = "count")
levels(summary_subtype$sub.celltype.annot) <- sort(unique(summary_subtype$sub.celltype.annot))
colors_subtype <- c("red","darkgreen","darkblue")
p8 <- ggplot(summary_subtype, aes(x = seurat_clusters, y = count, fill = sub.celltype.annot)) + geom_bar(stat = "identity",position = "fill") + scale_fill_manual(values = colors_subtype) + ggtitle("Multiomics")

plot_grid(p7,p8,ncol = 1,align = "v")

c. Define B cell subtypes based on SingleR subtype annotations

  • SingleR
  • Training Data ENCODE
singler_annot_df_bcr <- singler_annot_df[singler_annot_df$main.celltype.annot %in% "B cells",]
uni_bcell_subtypes   <- unique(singler_annot_df$fine_bencode)
non_bcell_subtypes   <- uni_bcell_subtypes[! uni_bcell_subtypes %in% c("naive B-cells","Memory B-cells","Plasma cells")]

# For simplicity we will annotate non-B cell annotations as Naive B cells
#  - Violin plots showed that these cells have virtually no IgG, IgA or V gene expression
singler_annot_df_bcr$fine_bencode[singler_annot_df_bcr$fine_bencode %in% non_bcell_subtypes] <- "naive B-cells"

# Add B cell subtype annotations into sub.celltype.annot column
for (bsubtype in unique(singler_annot_df_bcr$fine_bencode)) {
  subset_cells <- row.names(singler_annot_df_bcr[singler_annot_df_bcr$fine_bencode %in% bsubtype,])
  singler_annot_df[row.names(singler_annot_df) %in% subset_cells,"sub.celltype.annot"] <- bsubtype
}


# Add subtype annotations back into RNA-Seq object
rnaseq_obj$sub.celltype.annot <- singler_annot_df$sub.celltype.annot

# Plot
p9  <- dittoDimPlot(rnaseq_obj, "main.celltype.annot")
p10 <- dittoDimPlot(rnaseq_obj, "sub.celltype.annot")

grid.arrange(p9,p10,nrow=1)

4. Add cell type annotation to VDJ results

# TCR 
tcell_annotation_col        <- rep("No Annot",nrow(tcr_df))
names(tcell_annotation_col) <- tcr_df$unique_cell_id

subtype_annotations_tcr <- singler_annot_df[singler_annot_df$main.celltype.annot %in% "T cells",]
for (subtype in unique(subtype_annotations_tcr$sub.celltype.annot)) {
  subtype_cells <- row.names(subtype_annotations_tcr[subtype_annotations_tcr$sub.celltype.annot %in% subtype,])
  tcell_annotation_col[names(tcell_annotation_col) %in% subtype_cells] <- subtype
}
tcr_df$SUBTYPE_ANNOTATION <- tcell_annotation_col

# BCR
bcell_annotation_col<- rep("No Annot",nrow(bcr_df))
names(bcell_annotation_col) <- bcr_df$unique_cell_id

subtype_annotations_bcr <- singler_annot_df[singler_annot_df$main.celltype.annot %in% "B cells",]
for (subtype in unique(subtype_annotations_bcr$sub.celltype.annot)) {
  subtype_cells    <- row.names(subtype_annotations_bcr[subtype_annotations_bcr$sub.celltype.annot %in% subtype,])
  bcell_annotation_col[names(bcell_annotation_col) %in% subtype_cells] <- subtype
}
bcr_df$SUBTYPE_ANNOTATION <- bcell_annotation_col

5. define expanded clonotypes

a. generate clone count data frames

clone_counts_tcr <- createCloneCountTable(tcr_df)
clone_counts_bcr <- createCloneCountTable(bcr_df)

# Can identify expanded clonotypes in the CSF with a high CSF:PB ratio
clone_count_subset <- clone_counts_tcr[clone_counts_tcr$cell_count > 1 & clone_counts_tcr$CSF_count > 1, ]

b. create categories for clonal expansion

  • Highly expanded - cell count > 5
  • Moderate - 1 < cells count < 5
  • Non-expanded - cell count = 1
tcr_df <- addExpansionCategories(tcr_df,clone_counts_tcr)
bcr_df <- addExpansionCategories(bcr_df,clone_counts_bcr)

c. add expansion categories to scRNA-Seq object

expansion_categ_col <- paste(colnames(rnaseq_obj),rnaseq_obj$main.celltype.annot)
uni_expansion_categ <- unique(tcr_df$EXPANSION_CATEGORY)

for (categ in unique(tcr_df$EXPANSION_CATEGORY)) {
  cells_categ <- c()
  cells_tcr <- tcr_df[tcr_df$EXPANSION_CATEGORY %in% categ,"unique_cell_id"]
  cells_tcr <- unique(paste(cells_tcr,"T cells"))
  cells_bcr <- bcr_df[bcr_df$EXPANSION_CATEGORY %in% categ,"unique_cell_id"]
  cells_bcr <- unique(paste(cells_bcr,"B cells"))
  
  if(length(cells_tcr)> 0) {cells_categ <- c(cells_categ, cells_tcr)}
  if(length(cells_bcr)> 0) {cells_categ <- c(cells_categ, cells_bcr)}
  
  expansion_categ_col[expansion_categ_col %in% cells_categ] <- categ
}
names(expansion_categ_col) <- colnames(rnaseq_obj)
rnaseq_obj$expansion_category <- expansion_categ_col

6. Look at proportions

a. subset B cells

# add column for Diagnosis + Subtype & Diagnois + Compartment
rnaseq_obj$status.subtype     <- paste(rnaseq_obj$status,rnaseq_obj$sub.celltype.annot)
rnaseq_obj$status.compartment <- paste(rnaseq_obj$status,rnaseq_obj$compartment)

cells <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "B cells"])
bcell_obj <- rnaseq_obj[,cells]
cells <- names(rnaseq_obj$main.celltype.annot[rnaseq_obj$main.celltype.annot %in% "T cells"])
tcell_obj <- rnaseq_obj[,cells]

b. sample breakdown

  • Disease Patient has an increased proportion of B cells in the CSF of which most are memory B cells
p11 <- dittoBarPlot(rnaseq_obj, "sub.celltype.annot", group.by = "status.compartment", main = "All cells",color.panel = dittoColors()[1:4])
p12 <- dittoBarPlot(bcell_obj, "sub.celltype.annot", group.by = "status.compartment", main = "B cells",color.panel = dittoColors()[3:4])


plot_grid(p11,p12,nrow=2,align = "v")

c. clonotype expansion breakdown

  • Expanded T cell populations are observed in all compartments, while expanded B cells show more restriction
p11 <- dittoBarPlot(tcell_obj, "expansion_category", group.by = "status.compartment", main = "T cells")
p12 <- dittoBarPlot(bcell_obj, "expansion_category", group.by = "status.compartment", main = "B cells")

grid.arrange(p11,p12,nrow=2)

—————

SAVE Processed objects

save(rnaseq_obj,tcell_obj,bcell_obj,bcr_df,tcr_df,clone_counts_tcr,clone_counts_bcr,file = "objects/processed_objects.RData")